library(tidyverse)
![]() |
Import & Write
1 오픈데이터 분석 실습 : Import/Write Data
1.1 패키지 불러오기
- tidyverse에 포함된 readr 패키지가 read_csv 함수를 제공
2 Import Data
2.1 read_csv
- read.csv(R 4.0 미만의 기본 동작)와 다르게 문자열을 factor로 변환하지 않음
### 현재 경로 확인
getwd()
[1] "G:/내 드라이브/taek_blog/posts/Opendata_Analysis Ch3"
### 경로 설정
#setwd()
### csv파일 불러오기
heights <- read_csv("heights.csv")
heights
# A tibble: 1,192 × 6
earn height sex ed age race
<dbl> <dbl> <chr> <dbl> <dbl> <chr>
1 50000 74.4 male 16 45 white
2 60000 65.5 female 16 58 white
3 30000 63.6 female 16 29 white
4 50000 63.1 female 16 91 other
5 51000 63.4 female 17 39 white
6 9000 64.4 female 15 26 white
7 29000 61.7 female 12 49 white
8 32000 72.7 male 17 46 white
9 2000 72.0 male 15 21 hispanic
10 27000 72.2 male 12 26 white
# … with 1,182 more rows
### 절대경로 사용
heights <- read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/heights.csv")
heights
# A tibble: 1,192 × 6
earn height sex ed age race
<dbl> <dbl> <chr> <dbl> <dbl> <chr>
1 50000 74.4 male 16 45 white
2 60000 65.5 female 16 58 white
3 30000 63.6 female 16 29 white
4 50000 63.1 female 16 91 other
5 51000 63.4 female 17 39 white
6 9000 64.4 female 15 26 white
7 29000 61.7 female 12 49 white
8 32000 72.7 male 17 46 white
9 2000 72.0 male 15 21 hispanic
10 27000 72.2 male 12 26 white
# … with 1,182 more rows
### tibble 데이터 프레임 생성
read_csv("a,b,c
1,2,3
4,5,6")
# A tibble: 2 × 3
a b c
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
### 라인 스킵
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3" , skip=2)
# A tibble: 1 × 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
### 주석 스킵
read_csv("#A comment I want to skip
x,y,z
1,2,3", comment="#")
# A tibble: 1 × 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
### 컬럼 이름 없이 내용만
read_csv("1,2,3
4,5,6", col_names = F)
# A tibble: 2 × 3
X1 X2 X3
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
### '\n' : 줄바꿈(개행) 문자로 행 구분
read_csv("1,2,3 \n 4,5,6,", col_names = F)
# A tibble: 2 × 3
X1 X2 X3
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
### 컬럼 이름 지정
read_csv("1,2,3 \n 4,5,6,", col_names = c("A","B","C"))
# A tibble: 2 × 3
A B C
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
### NA값 부여
read_csv("a,b,c \n 1,2,.", na=".")
# A tibble: 1 × 3
a b c
<dbl> <dbl> <lgl>
1 1 2 NA
2.2 Locale 설정/확인
Sys.getlocale()
[1] "LC_COLLATE=Korean_Korea.utf8;LC_CTYPE=ko_KR.UTF-8;LC_MONETARY=Korean_Korea.utf8;LC_NUMERIC=C;LC_TIME=Korean_Korea.utf8"
### 언어 영어로
#Sys.setlocale("LC_ALL", "English")
### 로케일을 기본값("C")으로 초기화
#Sys.setlocale("LC_ALL", "C")
2.3 한글 파일 읽기
### 인코딩 찾기
guess_encoding("exercise.csv")
# A tibble: 2 × 2
encoding confidence
<chr> <dbl>
1 EUC-KR 1
2 IBM420_ltr 0.25
### 인코딩 입력으로 에러해결
exercise <- read_csv("exercise.csv", locale = locale(encoding = "EUC-KR"))
exercise
# A tibble: 5 × 2
이름 선호도
<chr> <dbl>
1 하민 5
2 하준 4
3 하진 4
4 태산 3
5 태민 2
### csv파일을 미리 열어 UTF-8 인코딩으로 저장한 뒤 불러오기
exercise <- read_csv("exercise_utf_8.csv")
exercise
# A tibble: 5 × 2
이름 선호도
<chr> <dbl>
1 하민 5
2 하준 4
3 하진 4
4 태산 3
5 태민 2
guess_encoding("exercise_utf_8.csv")
# A tibble: 3 × 2
encoding confidence
<chr> <dbl>
1 UTF-8 1
2 windows-1255 0.38
3 windows-1255 0.29
3 Write Data
3.1 파일 저장/삭제
heights <- read_csv("heights.csv")
heights
# A tibble: 1,192 × 6
earn height sex ed age race
<dbl> <dbl> <chr> <dbl> <dbl> <chr>
1 50000 74.4 male 16 45 white
2 60000 65.5 female 16 58 white
3 30000 63.6 female 16 29 white
4 50000 63.1 female 16 91 other
5 51000 63.4 female 17 39 white
6 9000 64.4 female 15 26 white
7 29000 61.7 female 12 49 white
8 32000 72.7 male 17 46 white
9 2000 72.0 male 15 21 hispanic
10 27000 72.2 male 12 26 white
# … with 1,182 more rows
### 현재 경로에 csv파일 저장
write_csv(heights, "만들 파일 이름.csv")
### rds 확장자
write_rds(heights, "만들 파일 이름.rds")
read_rds("만들 파일 이름.rds")
# A tibble: 1,192 × 6
earn height sex ed age race
<dbl> <dbl> <chr> <dbl> <dbl> <chr>
1 50000 74.4 male 16 45 white
2 60000 65.5 female 16 58 white
3 30000 63.6 female 16 29 white
4 50000 63.1 female 16 91 other
5 51000 63.4 female 17 39 white
6 9000 64.4 female 15 26 white
7 29000 61.7 female 12 49 white
8 32000 72.7 male 17 46 white
9 2000 72.0 male 15 21 hispanic
10 27000 72.2 male 12 26 white
# … with 1,182 more rows
### 파일 삭제
file.remove("만들 파일 이름.csv")
[1] TRUE
3.2 feather 패키지
#install.packages("feather")
library(feather)
write_feather(heights, "heights.feather")
read_feather("heights.feather")
# A tibble: 1,192 × 6
earn height sex ed age race
<dbl> <dbl> <chr> <dbl> <dbl> <chr>
1 50000 74.4 male 16 45 white
2 60000 65.5 female 16 58 white
3 30000 63.6 female 16 29 white
4 50000 63.1 female 16 91 other
5 51000 63.4 female 17 39 white
6 9000 64.4 female 15 26 white
7 29000 61.7 female 12 49 white
8 32000 72.7 male 17 46 white
9 2000 72.0 male 15 21 hispanic
10 27000 72.2 male 12 26 white
# … with 1,182 more rows